Medium Blog post: https://medium.com/@manjukiruthika/under-the-lens-seattle-airbnb-listings-993d6311cf44
import datetime
import pandas as pd
import numpy as np
import folium
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as col
import matplotlib.dates as mdates
import seaborn as sns
import os
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from time import time
%matplotlib inline
# Directory where derived statistics CSVs are written (see get_stats below).
processed_path = "../data/processed/"
Listings
# Load the raw Seattle listings data and take a first look at it.
seattle_data_listings_df = pd.read_csv("../data/seattle/listings.csv")
seattle_data_listings_df.head()
seattle_data_listings_df.shape
seattle_data_listings_df.columns
# Host tenure in whole years. The original `.astype('timedelta64[Y]')` unit
# conversion was deprecated and removed in pandas 2.x (it now raises);
# derive the same floored year count from the day delta instead
# (365.2425 days is the mean Gregorian year, matching numpy's 'Y' unit).
seattle_data_listings_df['host_since_in_years'] = np.floor(
    (datetime.datetime.now() - pd.to_datetime(seattle_data_listings_df['host_since'])).dt.days / 365.2425
)
seattle_data_listings_df
Data Preparation:
Replace % and $ sign in the below fields
# Strip the '%' from response rates and the '$'/',' from prices so both
# columns can be treated as numeric. Raw strings are used because '\%' and
# '\$' are invalid escape sequences in normal string literals (SyntaxWarning
# on Python 3.12+, slated to become an error).
seattle_data_listings_df["host_response_rate"] = seattle_data_listings_df["host_response_rate"].replace(r'[\%,]', '', regex=True).astype(float)
seattle_data_listings_df["price"] = seattle_data_listings_df["price"].replace(r'[\$,]', '', regex=True).astype(float)
Data Preparation : Removing Constants
# Drop constant columns: a column with a single distinct value carries no
# information for the analysis below. DataFrame.nunique() is the built-in
# equivalent of applying pd.Series.nunique column-wise.
seattle_data_listings_df = seattle_data_listings_df.loc[:, seattle_data_listings_df.nunique() > 1]
Data Understanding: Understand the percentiles of price and availability fields
# Visualise the spread and outliers of the nightly price distribution.
ax = sns.boxplot(seattle_data_listings_df["price"])
plt.title("Box plot - Price Field")
plt.show()
def print_percentiles(field):
    """
    Print the 5th, 25th, 50th, 75th and 95th percentiles of a field.

    field : array-like of numeric values (e.g. a pandas Series)
    returns : None
    """
    # The original printed "price field" in every message even when called on
    # other columns (e.g. availability_365); the wording is generic now, and
    # the five duplicated print statements collapse into one loop.
    for pct in (5, 25, 50, 75, 95):
        print("{0}th percentile for the field is {1}".format(pct, np.percentile(field, pct)))
print_percentiles(seattle_data_listings_df["price"])
# Same distribution check for the yearly-availability field: box plot
# followed by the key percentiles.
ax = sns.boxplot(seattle_data_listings_df["availability_365"])
plt.title("Box plot - Availability 365 Field")
plt.show()
print_percentiles(seattle_data_listings_df["availability_365"])
Load Calendar Data
# Load the calendar data: one row per listing per date, with availability
# and (when available) the listed price for that date.
seattle_data_calendar_df = pd.read_csv("../data/seattle/calendar.csv")
seattle_data_calendar_df.head()
seattle_data_calendar_df.columns
# Number of distinct listings covered by the calendar.
seattle_data_calendar_df["listing_id"].nunique()
def get_stats(df, save_to_file_name):
    """
    Compute descriptive statistics for every column of a dataframe and
    persist them under the processed-data directory.

    df : dataframe to summarise
    save_to_file_name : file name (not a full path) for the CSV output
    returns : summary dataframe with one row per original column
    """
    summary_df = df.describe(include='all').T.reset_index()
    # os.path.join is safer than raw string concatenation for building paths.
    summary_df.to_csv(os.path.join(processed_path, save_to_file_name))
    return summary_df
Calendar Data : Descriptive Statistics
# Descriptive statistics for the calendar data (also saved to CSV).
get_stats(seattle_data_calendar_df, "seattle_calendar_stats.csv").head(20)
Listings Data : Descriptive Statistics
# Descriptive statistics for the listings data (also saved to CSV).
get_stats(seattle_data_listings_df, "seattle_listing_stats.csv").head(20)
Approach: From the calendar data, get all the available listing dates and price. Summing the price and grouping the data by month should mention how busy each month is.
# Parse dates and convert the '$1,234.00'-style price strings to floats.
seattle_data_calendar_df["date"] = pd.to_datetime(seattle_data_calendar_df["date"])
# Raw string: '\$' is an invalid escape sequence in a normal string literal
# (SyntaxWarning on Python 3.12+).
seattle_data_calendar_df["price"] = seattle_data_calendar_df["price"].replace(r'[\$,]', '', regex=True).astype(float)
Dropping Missing Values
# Total listed price per calendar month. Rows with missing prices
# (unavailable dates) are dropped before summing.
seattle_data_calendar_df["month"] = seattle_data_calendar_df["date"].dt.month
seattle_data_calendar_group_df = seattle_data_calendar_df.loc[:, ["month", "price"]].dropna()
seattle_data_calendar_group_results_df = (
    seattle_data_calendar_group_df.groupby(["month"]).sum().reset_index()
)
# Month-number (as a string) to three-letter-abbreviation lookup table.
month_dict = {
    "1": "Jan", "2": "Feb", "3": "Mar", "4": "Apr",
    "5": "May", "6": "Jun", "7": "Jul", "8": "Aug",
    "9": "Sep", "10": "Oct", "11": "Nov", "12": "Dec",
}
# Vectorised lookup: stringify the month number and map it through the dict
# (equivalent to the original apply/lambda form).
seattle_data_calendar_group_results_df["month_text"] = (
    seattle_data_calendar_group_results_df["month"].astype(str).map(month_dict)
)
# Order months from busiest (highest summed price) to least busy, with a
# fresh 0..11 index for plotting.
seattle_data_calendar_group_results_df = (
    seattle_data_calendar_group_results_df
    .sort_values(by=["price"], ascending=False)
    .reset_index(drop=True)
)
seattle_data_calendar_group_results_df
Analysis
# Bar chart of total listed price per month, busiest month first.
plt.figure(figsize=(8,6))
g = sns.barplot(x=seattle_data_calendar_group_results_df.index, y="price", data=seattle_data_calendar_group_results_df)
# Replace the positional index tick labels with the month abbreviations.
g.set(xticklabels=list(seattle_data_calendar_group_results_df["month_text"]))
plt.xlabel("Month")
plt.ylabel("Price")
plt.title("Total Listings Price per month")
plt.show()
Observation
The month of December appears to be the busiest, pointing to the winter holiday period around Christmas and New Year. August is the next busiest, reflecting the summer school-holiday period. January is the least busy month of all.
Approach: Listings have neighbourhood information. I've joined/combined the listings and calendar data. Summing the price and grouping the data by neighbourhood helps one determine how much revenue each neighbourhood is making.
# Unique (listing id, neighbourhood) pairs. Grouping on both columns and
# counting de-duplicates while keeping both key columns after reset_index.
seattle_data_neighbourhood = (
    seattle_data_listings_df.loc[:, ["id", "neighbourhood"]]
    .groupby(["id", "neighbourhood"])
    .count()
    .reset_index()
)
seattle_data_neighbourhood
Joining Listings and Calendar data
# Attach the neighbourhood of each listing to its calendar rows, then total
# the listed price per neighbourhood.
seattle_data_neighbourhood_calendar = pd.merge(
    seattle_data_calendar_df,
    seattle_data_neighbourhood,
    left_on='listing_id',
    right_on="id",
    how='left',
)
seattle_data_neighbourhood_calendar
seattle_data_neighbourhood_calendar_group_results_df = (
    seattle_data_neighbourhood_calendar.loc[:, ["neighbourhood", "price"]]
    .groupby(["neighbourhood"])
    .sum()
    .reset_index()
)
Analysis
# The ten neighbourhoods with the highest total listed price.
seattle_data_neighbourhood_sorted_df = (
    seattle_data_neighbourhood_calendar_group_results_df
    .nlargest(10, 'price')
    .reset_index(drop=True)
)
seattle_data_neighbourhood_sorted_df
# Horizontal bar chart of revenue by neighbourhood, top 10 only.
plt.figure(figsize=(8,6))
g = sns.barplot(x="price", y="neighbourhood", data=seattle_data_neighbourhood_sorted_df)
plt.xlabel("Price")
plt.ylabel("Neighbourhood")
plt.title("Top 10 Neighbourhood by Listings Revenue")
plt.show()
Observation:
Capitol hill neighbourhood seems to make the most revenue. Either it has more listings available or it has listings which are priced higher
Understanding the data type of each column in listings : Getting a list of Numeric Columns
## Finding the data type of Variables
# Builds two lists of numeric column names: one including the 'id' column
# and one without it (used for the correlation plot below).
verbose = True
numeric_cols = []
numeric_cols_with_id = []
# NOTE: the loop variable is named `column` rather than `col` so it does not
# shadow the `matplotlib.colors as col` module alias imported at the top of
# the file.
for column in seattle_data_listings_df.columns:
    s = seattle_data_listings_df[column]
    # Identifiers and raw coordinates are numeric but not meaningful
    # correlates of price, so they are skipped entirely.
    if column in {'scrape_id', 'host_id', 'latitude', 'longitude'}:
        continue
    if s.dtype == object or s.dtype.name == 'category':
        col_type = 'Category'
    else:
        col_type = 'Numeric'
        # Every numeric column goes into the with-id list; 'id' itself is
        # excluded from numeric_cols. (The original duplicated the append
        # call in both branches of an if/else.)
        numeric_cols_with_id.append(column)
        if column != 'id':
            numeric_cols.append(column)
    if verbose:
        print('* {} - {}'.format(column.strip(), col_type))
print('* Numeric columns {}'.format(numeric_cols))
Approach : Extracting the numeric columns, I check whether there is any linear relationship between the numeric variables and price by running a correlation plot.
# Hierarchically clustered correlation heatmap of the numeric columns.
# NaN correlations (e.g. from all-null columns) are zero-filled so the
# clustering linkage does not fail.
fig_size = (20, 20)
fig = sns.clustermap(seattle_data_listings_df.loc[:,numeric_cols].corr().fillna(0.0), annot=True, figsize=fig_size,cmap="YlGnBu")
plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
plt.show()
Observation
From the hierarchical correlation plot above, one can observe that there are blocks of sections which are correlated with each other. Let us take the first block, which contains the price field. We can observe that price is correlated with bathrooms, bedrooms, accommodates (number of people it accommodates), beds, guests included and square feet. The negative correlation between reviews per month and price indicates that high-priced properties have fewer reviews.
Approach: From the percentiles obtained for price, we divide the price into 3 ranges - low, medium and high. Any value below 25th percentile is low and between 25th and 75th percentile is medium and above 75th percentile is high. We then plot the listings and their price ranges on the map based on longitude and latitude. The idea is to observe are there any locations which have more low price listings?. Are there locations which have high price and medium price listings?
def get_class_label(price, low=75, high=150):
    """
    Return a class label based on price ranges.

    price : listing price value
    low : prices strictly below this threshold are labelled 1 ("low")
    high : prices from `low` up to (but excluding) this threshold are
           labelled 2 ("medium"); anything else, including NaN, is 3 ("high")
    returns : class label (1, 2 or 3)

    The thresholds default to the values used in the notebook (roughly the
    25th and 75th price percentiles) but are now parameters so the function
    generalises to other datasets.
    """
    if price < low:
        return 1
    elif price < high:  # the original's `price >= 75 and` half was redundant
        return 2
    else:
        return 3
# Label every listing with its price class, and collect coordinates plus
# label into a small frame that drives the map markers below.
seattle_data_listings_df['price_label'] = seattle_data_listings_df['price'].apply(get_class_label)
lat_long_df = pd.DataFrame({
    'latitude': seattle_data_listings_df['latitude'],
    'longitude': seattle_data_listings_df['longitude'],
    'cluster': seattle_data_listings_df['price_label'],
})
lat_long_df
# Folium map centred just north of Seattle.
m = folium.Map(location=[47.732647,-122.341301
], zoom_start=7)
# Marker colours indexed by price label. Only indices 1-3 (blue, green,
# orange) are actually used, since the labels produced above are 1, 2 or 3.
colors = [
    'pink',
    'blue',
    'green',
    'orange',
    'black',
    'orange',
    'beige',
    'green',
    'darkgreen',
    'lightgreen',
    'darkblue',
    'lightblue',
    'purple',
    'darkpurple',
    'darkred',
    'cadetblue',
    'gray',
    'lightred'
]
def get_popup_text(label):
    """
    Return a human-readable popup description for a price label.

    label : price label value (1 = low, 2 = medium, anything else = high)
    returns : 'low', 'medium' or 'high'
    """
    # Dict dispatch replaces the original if/elif chain; any label other
    # than 1 or 2 falls through to 'high', exactly as before.
    popup_by_label = {1: 'low', 2: 'medium'}
    return popup_by_label.get(label, 'high')
# Drop one marker per listing, coloured and popup-labelled by price class.
# itertuples iterates rows directly instead of the original's three repeated
# .iloc positional lookups per row.
for row in lat_long_df.itertuples(index=False):
    cluster = int(row.cluster)
    folium.Marker(
        [row.latitude, row.longitude],
        popup=get_popup_text(cluster),
        icon=folium.Icon(color=colors[cluster]),
    ).add_to(m)
# NOTE: the original legend wrapped the medium-price style attribute in
# typographic (curly) quotes, which browsers ignore — straight quotes fix
# the green legend marker.
legend_html = '''
<div style="position: fixed; bottom: 50px; left: 50px; width: 100px; height: 90px; border:2px solid grey; z-index:9999; font-size:14px;"> Low <i class="fa fa-map-marker fa-2x" style="color:#5DADE2"></i><br> Medium <i class="fa fa-map-marker fa-2x" style="color:#64C714"></i><br> High Price <i class="fa fa-map-marker fa-2x" style="color:#D68910"></i></div>'''
m.get_root().html.add_child(folium.Element(legend_html))
m

m.save('seattle_folium_map.html')
Observation: Studying the map, one can observe the prevalence of low/medium/high listings. For instance, around the University of Washington there are many low- and medium-priced listings. Around the Capitol Hill area, there is a higher prevalence of medium- and high-priced listings.
Approach : In the earlier section of code, i considered only the numeric variables to understand their effect on price. Now combining the categorical variables like the property, host and review information - I check whether we are able to predict the price range of a property as low / medium and high? Instead of predicting price with the small volume of data, I've turned it into a classification problem where I'm predicting price ranges(low/medium/high). I choose a machine learning algorithm - Random Forests Classifier. This tree based algorithm, could handle combination of categorical and numeric data to help predict the price ranges. The data is split into 80% train and 20% test sets. 5 fold cross validation is carried out on the training data for different number of estimators. The optimal hyperparameter is chosen and the algorithm is trained with the optimal parameter. It is then tested on 20% data. Accuracy metric is used for evaluation of the algorithm.
# Model dataset: property, host and review attributes plus the price_label
# target derived earlier.
data_df = seattle_data_listings_df.loc[:,['property_type', 'room_type', 'accommodates', 'host_response_time', 'host_response_rate', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'host_listings_count', 'guests_included', 'number_of_reviews','minimum_nights','maximum_nights', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness','review_scores_checkin', 'review_scores_communication','review_scores_location', 'review_scores_value','reviews_per_month','availability_365','host_since_in_years','price_label']]
# NOTE: explicit imputation of the null host/review fields (filling review
# scores with 0 and host_response_time with 'missing') was considered; the
# commented-out code for it has been removed because the rows containing
# nulls are simply dropped further below instead.
Handle Categorical Data
# One-hot encode the categorical columns.
data_df = pd.get_dummies(data_df)
data_df
# Feature columns: everything except the target. A comprehension keeps a
# deterministic, dataframe-order list; the original built it with a set
# difference, whose ordering depends on string hashing and therefore varies
# between runs, making the feature-importance output non-reproducible.
columns_minus_price = [c for c in data_df.columns if c not in ('price', 'price_label')]
columns_minus_price
Dropping nulls
# Drop rows containing any missing values before modelling.
data_df = data_df.dropna()
data_df.shape
Classification algorithm to detect price ranges
# Hyperparameter search: 5-fold CV over the number of trees, on an 80/20
# train/test split with fixed seeds for reproducibility. (The dead
# `best_params = {}` initialisation and the commented-out duplicate split
# call from the original are removed.)
parameters = {'n_estimators': [25, 50, 100, 150, 200, 250]}
X_train, X_test, y_train, y_test = train_test_split(
    data_df.loc[:, columns_minus_price],
    data_df['price_label'],
    train_size=0.8,
    random_state=0,
)
# class_weight='balanced' compensates for the unequal low/medium/high counts;
# max_depth=5 keeps the trees small during the search.
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=0, max_depth=5)
grid_search = GridSearchCV(rf_clf, parameters, scoring='f1_macro', cv=5, verbose=10)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params
# Retrain with the best estimator count found by the grid search.
# max_depth=5 is included so the final model matches the configuration the
# cross-validation actually evaluated — the original omitted it, applying
# the selected n_estimators to a differently-regularised model.
best_model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    random_state=0,
    class_weight="balanced",
    max_depth=5,
)
best_model.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
# Training accuracy: sanity check only (expected to exceed test accuracy).
p = best_model.predict(X_train)
print("Training Accuracy", accuracy_score(y_train, p))
# Held-out 20% test accuracy: the headline evaluation metric.
p = best_model.predict(X_test)
print("Test Accuracy", accuracy_score(y_test, p))
# Rank features from most to least important and pair each column name with
# its importance score. Indexing columns_minus_price positionally is valid
# because X_train was selected with exactly that column order.
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]
print(indices)
print([columns_minus_price[idx] for idx in indices])
columns_n_importances = [
    (columns_minus_price[idx], importances[idx]) for idx in indices
]
# Horizontal bar chart of the 20 most important features.
plt.figure(figsize=(8,10))
# The original also passed data=seattle_data_calendar_group_results_df — an
# unrelated dataframe. Since x and y are explicit vectors, seaborn ignores
# the data argument, so it is dropped here.
g = sns.barplot(
    x=[imp for _, imp in columns_n_importances[:20]],
    y=[name for name, _ in columns_n_importances[:20]],
)
plt.xlabel("Feature Importance")
plt.ylabel("Fields")
plt.title("Top 20 features to determine listing price ranges (low/medium/high)")
plt.show()
Observation:
Based on our experiment using a classification algorithm to detect the price ranges, one can observe that the test set reaches an accuracy of ~75%. The features are ranked based on their importance, and the visualisation shows the top 20 variables which have an impact on determining the price range. We can observe that the room type (entire home/apt), reviews per month, number of bedrooms and availability all have an impact on determining the price ranges.
Conclusion and Remarks: Either with more data and better algorithm one could improve on the classification results of predicting price ranges. Data has room for further analysis - checking how temporal information has an impact on price etc.